In [1]:
from sqlalchemy import create_engine
import pandas as pd
import pandas.io.sql as sqlio
import psycopg2
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt
import seaborn as sns
from ast import literal_eval
In [2]:
#credentials anonymized
engine= create_engine('postgresql+psycopg2://postgres:pw@hostname:port/db_name')
In [3]:
df = pd.read_csv("Data/churn.csv")

df.columns = df.columns.str.lower()
df['first_paid_date'] = pd.to_datetime(df['first_paid_date'])
df['current_month'] = pd.to_datetime(df['month'])
df['current_month'] = df['current_month'].dt.strftime("%Y-%m")
df['cohort_month'] = df['first_paid_date'].dt.strftime("%Y-%m")
In [4]:
df = df.round({'throughput':0})
df['mtu'] = df['mtu'].fillna(0)
df['source_types']=df['source_types'].fillna('')
df['counts_by_source_type']=df['counts_by_source_type'].fillna('')
df['destination_names']=df['destination_names'].fillna('')
df['counts_by_destination_name']=df['counts_by_destination_name'].fillna('')
df['server_side_integration_calls'] = df['server_side_integration_calls'].fillna(0)
In [5]:
def literal_return(val):
    try:
        return literal_eval(val)
    except (ValueError, SyntaxError) as e:
        return val
    
df['source_types'] = df['source_types'].apply(literal_return)
df['counts_by_source_type']=df['counts_by_source_type'].apply(literal_return)
df['destination_categories']=df['destination_categories'].apply(literal_return)
df['destination_categories_counts']=df['destination_categories_counts'].apply(literal_return)
df['destination_names']=df['destination_names'].apply(literal_return)
df['counts_by_destination_name']=df['counts_by_destination_name'].apply(literal_return)
In [6]:
df['total_dest_cat_count'] = (
    df.destination_categories_counts.apply(lambda x: sum([int(e) for e in x]) if type(x) == list else 0)
)

df['total_sources_count'] = (
    df.counts_by_source_type.apply(lambda x: sum([int(e) for e in x]) if type(x) == list else 0)
)

df['total_dest_count'] = (
    df.counts_by_destination_name.apply(lambda x: sum([int(e) for e in x]) if type(x) == list else 0)
)
In [7]:
#sql manipulations after loading original churn data in to a postgres database table called churn.
sql_query=("""
    drop table if exists churn_info; 
    create table churn_info as
    with max_month as 
        (select u_customer_id,max(months_since_start) as max_month_available
        from churn 
        group by u_customer_id),
    first_month as 
        (select u_customer_id,payment_plan,first_paid_date
        from churn 
        where months_since_start = 0),
    first_churn as 
        (select u_customer_id,min(months_since_start) as first_churn
        from churn 
        where paying = 'FALSE' 
        group by u_customer_id),
    customer_info as
        (select t1.*,  
            t3.payment_plan as payment_plan_started, 
            t3.first_paid_date as cohort_month, 
            t2.paying as payment_status_end,
            t2.payment_plan as payment_plan_at_end
        from 
            max_month t1, churn t2, first_month t3
        where t1.u_customer_id = t2.u_customer_id 
        and t1.max_month_available = t2.months_since_start
        and t1.u_customer_id = t3.u_customer_id)
    select t1.*, t4.first_churn as first_churn_month
    from customer_info t1
        left join first_churn t4
        on t1.u_customer_id = t4.u_customer_id ;


    alter table churn_info add column payment_plan varchar(100);

    update churn_info 
    set payment_plan = payment_plan_at_end
    where payment_plan_at_end = payment_plan_started;

    update churn_info 
    set payment_plan = concat(payment_plan_started,'-',payment_plan_at_end)
    where payment_plan_at_end != payment_plan_started;

    alter table churn_info 
    add column prev_churn_status varchar(100),
    add column final_churn_status varchar(100);

    update churn_info 
    set final_churn_status = null;

    update churn_info 
    set final_churn_status = 'churned'
    where payment_status_end = 'FALSE';

    update churn_info 
    set final_churn_status = 'retained'
    where payment_status_end = 'TRUE';

    update churn_info set prev_churn_status = null;

    update churn_info 
    set prev_churn_status = 'never_churned'
    where first_churn_month is null;

    update churn_info 
    set prev_churn_status = 'reactivated'
    where first_churn_month is not null;


    update churn_info 
    set prev_churn_status = 'first_churn'
    where payment_status_end = 'FALSE'
    and first_churn_month = max_month_available;""",engine)
In [12]:
customer_info = pd.read_sql("""
    with churn_times as
        (select u_customer_id,count(months_since_start) no_of_churns
        from churn 
        where paying = 'FALSE'
        group by u_customer_id)
    select t1.*, t2.no_of_churns
    from churn_info t1
    left join churn_times t2
    on t1.u_customer_id = t2.u_customer_id;""",engine)
In [14]:
customer_info = customer_info.fillna(0)
customer_info['tenure'] = np.where(customer_info['prev_churn_status'] == 'reactivated',
                                           customer_info['first_churn_month'],customer_info['max_month_available'])
In [15]:
final_churn = customer_info.groupby(['payment_plan','final_churn_status']).agg({'u_customer_id':'count'}).reset_index()
final_churn['percent'] =  final_churn['u_customer_id'] / final_churn.groupby('payment_plan')['u_customer_id'].transform('sum')

final_churn['total_per_plan'] = final_churn.groupby('payment_plan')['u_customer_id'].transform('sum')
final_churn['percent_per_plan'] =  final_churn['total_per_plan'] / final_churn['u_customer_id'].sum()

final_churn.round({'percent':2,'percent_per_plan':2})
Out[15]:
payment_plan final_churn_status u_customer_id percent total_per_plan percent_per_plan
0 annual churned 56 0.49 115 0.03
1 annual retained 59 0.51 115 0.03
2 monthly churned 1974 0.58 3422 0.93
3 monthly retained 1448 0.42 3422 0.93
4 monthly-annual churned 18 0.14 132 0.04
5 monthly-annual retained 114 0.86 132 0.04
In [16]:
px.bar(final_churn,x='payment_plan',y='percent',barmode='group',title='Customers churn & retention in each payment plan',
                  labels={'percent':'% of customers'},height=400,width=700,
       color='final_churn_status',text='final_churn_status')
In [17]:
churn = customer_info.groupby(['payment_plan','prev_churn_status','final_churn_status']).agg({'u_customer_id':'count'}).reset_index()
churn.rename(columns={'u_customer_id':'customer_count'})
Out[17]:
payment_plan prev_churn_status final_churn_status customer_count
0 annual first_churn churned 55
1 annual never_churned retained 52
2 annual reactivated churned 1
3 annual reactivated retained 7
4 monthly first_churn churned 1766
5 monthly never_churned retained 1196
6 monthly reactivated churned 208
7 monthly reactivated retained 252
8 monthly-annual first_churn churned 18
9 monthly-annual never_churned retained 93
10 monthly-annual reactivated retained 21
In [18]:
cohort = pd.read_sql("""
with t1 as 
    (select u_customer_id, first_paid_date,months_since_start
        from churn where u_customer_id in 
        (select  u_customer_id from churn_info ci  where prev_churn_status = 'never_churned')
    union 
    select t1.u_customer_id, first_paid_date,months_since_start
        from churn t1, churn_info t2
    where t1.u_customer_id = t2.u_customer_id
    and t1.months_since_start < t2.first_churn_month 
    and t2.prev_churn_status != 'never_churned')
select first_paid_date,months_since_start,count(u_customer_id) 
from t1 
group by 1, 2""",engine)

#cohort = cohort.groupby(['first_paid_date','months_since_start']).agg(customer_count=('u_customer_id','count')).reset_index()
cohort = cohort.pivot(index='first_paid_date',columns='months_since_start',values='count')

retention = cohort.divide(cohort.iloc[:,0],axis=0)
retention = round(retention * 100)
In [19]:
plt.figure(figsize=(16,10))
sns.heatmap(retention.iloc[:, 0:21],annot=True,fmt='g')
plt.ylabel('cohort month')
Out[19]:
Text(123.0, 0.5, 'cohort month')
In [20]:
df = df.merge(customer_info[['u_customer_id','final_churn_status','prev_churn_status','no_of_churns','payment_plan_at_end','tenure']],on=['u_customer_id'],how='inner')
In [21]:
df.groupby(['payment_plan_at_end','prev_churn_status'])['mtu'].describe().round(0)
Out[21]:
count mean std min 25% 50% 75% max
payment_plan_at_end prev_churn_status
annual first_churn 1211.0 130507.0 319413.0 0.0 2134.0 27052.0 122232.0 3247053.0
never_churned 3770.0 160053.0 368117.0 0.0 12240.0 60914.0 153196.0 6248705.0
reactivated 661.0 122616.0 226852.0 0.0 4950.0 42329.0 132661.0 2162331.0
monthly first_churn 17722.0 7957.0 33620.0 0.0 9.0 444.0 4400.0 2511437.0
never_churned 31095.0 12434.0 27264.0 0.0 455.0 3091.0 12884.0 1384067.0
reactivated 8142.0 11277.0 30419.0 0.0 89.0 1732.0 9501.0 1011359.0
In [22]:
df.groupby(['payment_plan_at_end','final_churn_status'])['throughput'].describe().round(0)
Out[22]:
count mean std min 25% 50% 75% max
payment_plan_at_end final_churn_status
annual churned 1063.0 40606.0 821473.0 1.0 22.0 154.0 807.0 25668488.0
retained 4278.0 9901.0 522105.0 1.0 23.0 116.0 492.0 34132111.0
monthly churned 16384.0 56918.0 1848021.0 1.0 11.0 70.0 475.0 165798354.0
retained 34503.0 136958.0 7102134.0 1.0 16.0 88.0 502.0 619023924.0
In [23]:
df.groupby(['payment_plan_at_end','final_churn_status'])['all_integrations_count'].describe().round(0)
Out[23]:
count mean std min 25% 50% 75% max
payment_plan_at_end final_churn_status
annual churned 1070.0 19.0 18.0 1.0 8.0 15.0 24.0 214.0
retained 4315.0 25.0 22.0 1.0 9.0 18.0 33.0 184.0
monthly churned 17670.0 10.0 12.0 1.0 3.0 7.0 13.0 325.0
retained 35676.0 16.0 23.0 1.0 5.0 10.0 19.0 823.0
In [24]:
df.groupby(['payment_plan_at_end','final_churn_status'])['active_integrations_count'].describe().round(0)
Out[24]:
count mean std min 25% 50% 75% max
payment_plan_at_end final_churn_status
annual churned 1070.0 9.0 10.0 0.0 0.0 6.0 15.0 55.0
retained 4315.0 14.0 16.0 0.0 0.0 10.0 22.0 101.0
monthly churned 17670.0 4.0 7.0 0.0 0.0 0.0 5.0 168.0
retained 35676.0 8.0 11.0 0.0 0.0 4.0 11.0 334.0
In [25]:
df.groupby(['payment_plan_at_end','final_churn_status'])['total_events'].describe().round(0)
Out[25]:
count mean std min 25% 50% 75% max
payment_plan_at_end final_churn_status
annual churned 900.0 338.0 551.0 1.0 39.0 136.0 389.0 4603.0
retained 3877.0 357.0 624.0 1.0 44.0 145.0 410.0 10352.0
monthly churned 10372.0 161.0 365.0 1.0 11.0 39.0 142.0 6328.0
retained 23420.0 213.0 2070.0 1.0 16.0 55.0 174.0 162348.0
In [26]:
df.groupby(['payment_plan_at_end','final_churn_status'])['active_users'].describe().round(0)
Out[26]:
count mean std min 25% 50% 75% max
payment_plan_at_end final_churn_status
annual churned 1122.0 2.0 2.0 0.0 1.0 2.0 3.0 13.0
retained 4347.0 3.0 2.0 0.0 1.0 2.0 4.0 18.0
monthly churned 18590.0 1.0 1.0 0.0 0.0 1.0 1.0 17.0
retained 36037.0 1.0 1.0 0.0 0.0 1.0 2.0 18.0
In [27]:
df.groupby(['payment_plan_at_end','final_churn_status'])['warehouse_rows_synced'].describe().round(0)
Out[27]:
count mean std min 25% 50% 75% max
payment_plan_at_end final_churn_status
annual churned 1076.0 16619582.0 71363382.0 0.0 0.0 1284794.0 11341396.0 1.522950e+09
retained 4334.0 19600350.0 68591689.0 0.0 0.0 1288500.0 12092959.0 1.531019e+09
monthly churned 17988.0 987120.0 24266827.0 0.0 0.0 0.0 13123.0 2.516464e+09
retained 35861.0 3609984.0 242923957.0 0.0 0.0 0.0 295975.0 3.929108e+10
In [28]:
df.groupby(['payment_plan_at_end','prev_churn_status'])['all_integrations_count'].describe().round(0)
Out[28]:
count mean std min 25% 50% 75% max
payment_plan_at_end prev_churn_status
annual first_churn 1070.0 19.0 18.0 1.0 8.0 15.0 24.0 214.0
never_churned 3702.0 25.0 22.0 1.0 10.0 19.0 33.0 184.0
reactivated 613.0 22.0 22.0 1.0 7.0 15.0 27.0 130.0
monthly first_churn 15330.0 10.0 12.0 1.0 3.0 6.0 13.0 325.0
never_churned 30696.0 16.0 23.0 1.0 5.0 10.0 19.0 823.0
reactivated 7320.0 15.0 19.0 1.0 5.0 10.0 18.0 515.0
In [29]:
usage = df.groupby(['u_customer_id','prev_churn_status','final_churn_status','payment_plan_at_end'])[['all_integrations_count','active_users','total_events','mtu','throughput','all_sources_count']].median().reset_index()
In [30]:
usage = usage.merge(customer_info[['u_customer_id','tenure']],on='u_customer_id',how='inner')
In [31]:
fig = px.box(usage[usage['mtu']<300000],x='payment_plan_at_end',y='mtu',points='all',color='final_churn_status',
          title='MTU by Payment Plan',height=500,width=800)
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))
fig.show()
In [32]:
usage[usage['u_customer_id'].isin(['oycurahdsuuc','pcjprrldioxl','ejbnkfpdixdv'])]
Out[32]:
u_customer_id prev_churn_status final_churn_status payment_plan_at_end all_integrations_count active_users total_events mtu throughput all_sources_count tenure
678 ejbnkfpdixdv reactivated retained monthly 12.0 0.0 32.0 419.0 13.0 4.0 12.0
2211 oycurahdsuuc reactivated churned monthly 4.0 2.0 41.0 77.0 25.5 2.0 14.0
2230 pcjprrldioxl never_churned retained annual 47.5 2.0 213.5 52588.0 203.0 16.0 25.0
In [33]:
conditions = [
    (usage['mtu'] <= 10000),
    (usage['mtu'] > 10000) & (usage['mtu'] <= 25000),
    (usage['mtu'] > 25000) & (usage['mtu'] <= 100000),
    (usage['mtu'] > 100000)]

values = ['-10k','10k-25k', '25k-100k','100k+']
usage['mtu_cat'] = np.select(conditions, values)
In [34]:
usage_mtu = usage.groupby(['mtu_cat','payment_plan_at_end','final_churn_status']).agg({'u_customer_id':'count'}).reset_index()
In [35]:
fig = px.box(usage[usage['all_integrations_count']<100],x='mtu_cat',y='all_integrations_count',
             color='final_churn_status',facet_col='payment_plan_at_end',
             labels={'mtu_cat':'MTU category'},
            title = 'How complex integrations affect retention and churn?')
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))
#fig.update_traces(boxpoints=False)
fig.show()
In [36]:
px.box(usage[usage['all_sources_count']<50],x='mtu_cat',y='all_sources_count',color='final_churn_status',facet_col='payment_plan_at_end')
In [37]:
#px.scatter(df[df['api_calls']<100000000],x='months_since_start',y='api_calls',facet_row='prev_churn_status')
fig = px.scatter(df[(df['prev_churn_status']=='reactivated') & (df['api_calls']<30000000)],
                 x='months_since_start',y='api_calls',facet_row='final_churn_status',
                 title = "Volume of API Calls of Reactivated Customers")
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_yaxes(title='')
fig.update_traces(marker=dict(color='green'))
In [40]:
fig = px.box(df[df['server_side_integration_calls']<500000],y='server_side_integration_calls',
             x='months_since_start',facet_row='final_churn_status',
             title = 'Distribution of Server Side Integration Calls Over Customer Tenure')
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_yaxes(title='')
#fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide', yaxis_title=None)
In [41]:
px.box(df[df['anonymous_users']<100000],y='anonymous_users',x='months_since_start',facet_row='final_churn_status')
In [42]:
fig = px.box(df[df['total_dest_cat_count']<=50],y='total_dest_cat_count',x='months_since_start',
       facet_col='final_churn_status',
             labels={'total_dest_cat_count':'Total Destination Categories Count',
                    'months_since_start':'Month Since Start'})
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_yaxes(title='')
fig.show()
In [43]:
avg_tenure = pd.read_sql("""with t1 as 
        (select payment_plan_at_end,final_churn_status,prev_churn_status ,u_customer_id,max_month_available 
        from churn_info
            where prev_churn_status != 'reactivated'
            union 
            select payment_plan_at_end,final_churn_status,prev_churn_status,u_customer_id,first_churn_month 
            from churn_info ci 
            where prev_churn_status = 'reactivated')
       select payment_plan_at_end,final_churn_status,prev_churn_status,
        case 
        when prev_churn_status = 'first_churn' and final_churn_status = 'churned' then final_churn_status
        when prev_churn_status = 'never_churned' and final_churn_status = 'retained' then final_churn_status
        when prev_churn_status = 'reactivated' and final_churn_status = 'churned' then concat(prev_churn_status,'-',final_churn_status)
        else concat(prev_churn_status,'-',final_churn_status) 
        end as churn_status,
        round(avg(max_month_available),0) as avg from t1
        group by payment_plan_at_end,final_churn_status,prev_churn_status;;""",engine)
avg_tenure
Out[43]:
payment_plan_at_end final_churn_status prev_churn_status churn_status avg
0 annual churned first_churn churned 16.0
1 annual churned reactivated reactivated-churned 4.0
2 annual retained never_churned retained 25.0
3 annual retained reactivated reactivated-retained 11.0
4 monthly churned first_churn churned 9.0
5 monthly churned reactivated reactivated-churned 5.0
6 monthly retained never_churned retained 25.0
7 monthly retained reactivated reactivated-retained 8.0
In [44]:
fig = px.bar(avg_tenure,x='churn_status',y='avg',barmode='group',
             facet_col='payment_plan_at_end',text='avg',labels={'avg':'Avg Tenure'})
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
fig.update_xaxes(title='')
In [ ]: